library(tidyverse)
library(scales)
The Fisheries and Aquaculture Department of the Food and Agriculture Organization of the United Nations collects data on countries' fisheries production. The (not-so-great) visualization below shows the distribution of countries' fishery harvest for 2016, by capture and aquaculture.
Question: What are some ways you would improve this visualization?
Let’s load the data:
fisheries <- read_csv("data/fisheries.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## capture = col_double(),
## aquaculture = col_double(),
## total = col_double()
## )
names(fisheries)
## [1] "country" "capture" "aquaculture" "total"
And inspect it:
fisheries
## # A tibble: 216 x 4
## country capture aquaculture total
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 1000 1200 2200
## 2 Albania 7886 950 8836
## 3 Algeria 95000 1361 96361
## 4 American Samoa 3047 20 3067
## 5 Andorra 0 0 0
## 6 Angola 486490 655 487145
## 7 Antigua and Barbuda 3000 10 3010
## 8 Argentina 755226 3673 758899
## 9 Armenia 3758 16381 20139
## 10 Aruba 142 0 142
## # … with 206 more rows
Filter out countries whose total harvest was 100,000 tonnes or less, since they are not included in the visualization:
# Keep only the countries whose total harvest exceeds 100,000.
fisheries <- filter(fisheries, total > 100000)
fisheries
## # A tibble: 82 x 4
## country capture aquaculture total
## <chr> <dbl> <dbl> <dbl>
## 1 Angola 486490 655 487145
## 2 Argentina 755226 3673 758899
## 3 Australia 174629 96847 271476
## 4 Bangladesh 1674770 2203554 3878324
## 5 Brazil 705000 581230 1286230
## 6 Cambodia 629950 172500 802450
## 7 Cameroon 233190 2315 235505
## 8 Canada 874727 200765 1075492
## 9 Chad 110000 94 110094
## 10 Chile 1829238 1050117 2879355
## # … with 72 more rows
Then, we will join this with the continent data.
continents <- read_csv("data/continents.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## continent = col_character()
## )
continents
## # A tibble: 245 x 2
## country continent
## <chr> <chr>
## 1 Afghanistan Asia
## 2 Åland Islands Europe
## 3 Albania Europe
## 4 Algeria Africa
## 5 American Samoa Oceania
## 6 Andorra Europe
## 7 Angola Africa
## 8 Anguilla Americas
## 9 Antigua & Barbuda Americas
## 10 Argentina Americas
## # … with 235 more rows
something_join(x, y)
inner_join(): all rows from x where there are matching values in y, returning all combinations when there are multiple matches
left_join(): all rows from x
right_join(): all rows from y
full_join(): all rows from both x and y
anti_join(): all rows from x where there are no matching values in y, never duplicating rows of x
For the next bit…
x
## # A tibble: 3 x 1
## value
## <dbl>
## 1 1
## 2 2
## 3 3
y
## # A tibble: 3 x 1
## value
## <dbl>
## 1 1
## 2 2
## 3 4
inner_join()
inner_join(x, y)
## Joining, by = "value"
## # A tibble: 2 x 1
## value
## <dbl>
## 1 1
## 2 2
left_join()
left_join(x, y)
## Joining, by = "value"
## # A tibble: 3 x 1
## value
## <dbl>
## 1 1
## 2 2
## 3 3
right_join()
right_join(x, y)
## Joining, by = "value"
## # A tibble: 3 x 1
## value
## <dbl>
## 1 1
## 2 2
## 3 4
full_join()
full_join(x, y)
## Joining, by = "value"
## # A tibble: 4 x 1
## value
## <dbl>
## 1 1
## 2 2
## 3 3
## 4 4
anti_join()
anti_join(x, y)
## Joining, by = "value"
## # A tibble: 1 x 1
## value
## <dbl>
## 1 3
Question: We want to keep all rows and columns from `fisheries` and add a
column for corresponding continents. Which join function should we use?
fisheries %>% select(country)
## # A tibble: 82 x 1
## country
## <chr>
## 1 Angola
## 2 Argentina
## 3 Australia
## 4 Bangladesh
## 5 Brazil
## 6 Cambodia
## 7 Cameroon
## 8 Canada
## 9 Chad
## 10 Chile
## # … with 72 more rows
continents
## # A tibble: 245 x 2
## country continent
## <chr> <chr>
## 1 Afghanistan Asia
## 2 Åland Islands Europe
## 3 Albania Europe
## 4 Algeria Africa
## 5 American Samoa Oceania
## 6 Andorra Europe
## 7 Angola Africa
## 8 Anguilla Americas
## 9 Antigua & Barbuda Americas
## 10 Argentina Americas
## # … with 235 more rows
fisheries <- left_join(fisheries, continents)
## Joining, by = "country"
Question: How does `left_join()` know to join the two data frames by `country`?
Hint:
## [1] "country" "capture" "aquaculture" "total"
## [1] "country" "continent"
Let’s check to make sure all countries now have a continent assigned.
fisheries %>%
filter(is.na(continent))
## # A tibble: 3 x 5
## country capture aquaculture total continent
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 Democratic Republic of the Congo 237372 3161 240533 <NA>
## 2 Hong Kong 142775 4258 147033 <NA>
## 3 Myanmar 2072390 1017644 3090034 <NA>
Nope!
We will need to manually fix some of these.
# Fill in the continent by hand for the three countries the join left without
# one; every other row keeps the continent it already has.
fisheries <- mutate(
  fisheries,
  continent = case_when(
    country == "Democratic Republic of the Congo" ~ "Africa",
    country %in% c("Hong Kong", "Myanmar") ~ "Asia",
    TRUE ~ continent
  )
)
…and check again
fisheries %>%
filter(is.na(continent))
## # A tibble: 0 x 5
## # … with 5 variables: country <chr>, capture <dbl>, aquaculture <dbl>,
## # total <dbl>, continent <chr>
Question: What does the following code do?
# Share of each country's total harvest that comes from aquaculture.
fisheries <- mutate(fisheries, aquaculture_perc = aquaculture / total)

# Smallest, mean, and largest aquaculture share within each continent.
fisheries_summary <- fisheries %>%
  group_by(continent) %>%
  summarise(
    min_ap  = min(aquaculture_perc),
    mean_ap = mean(aquaculture_perc),
    max_ap  = max(aquaculture_perc)
  )
fisheries_summary
## # A tibble: 5 x 4
## continent min_ap mean_ap max_ap
## <chr> <dbl> <dbl> <dbl>
## 1 Africa 0 0.0943 0.803
## 2 Americas 0 0.192 0.529
## 3 Asia 0 0.367 0.782
## 4 Europe 0.00682 0.165 0.618
## 5 Oceania 0.0197 0.150 0.357
# Bar chart of the mean aquaculture share, one bar per continent.
fisheries_summary %>%
  ggplot(aes(x = continent, y = mean_ap)) +
  geom_col()
# Horizontal bar chart of the mean aquaculture share per continent, with the
# bars ordered by that mean and the x axis formatted as percentages.
fisheries_summary %>%
  mutate(continent = fct_reorder(continent, mean_ap)) %>%
  ggplot(aes(x = mean_ap, y = continent)) +
  geom_col() +
  scale_x_continuous(labels = percent) + #<<
  theme_minimal() +
  labs(
    title = "Average share of aquaculture by continent",
    subtitle = "out of total fisheries harvest, 2016",
    caption = "Source: bit.ly/2VrawTt",
    x = "", y = ""
  )
The map_data() function easily turns data from the maps package into a data frame suitable for plotting with ggplot2:
head(map_data("world"))
## long lat group order region subregion
## 1 -69.89912 12.45200 1 1 Aruba <NA>
## 2 -69.89571 12.42300 1 2 Aruba <NA>
## 3 -69.94219 12.43853 1 3 Aruba <NA>
## 4 -70.00415 12.50049 1 4 Aruba <NA>
## 5 -70.06612 12.54697 1 5 Aruba <NA>
## 6 -70.05088 12.59707 1 6 Aruba <NA>
Question: What does the following code do?
# World map polygons with a few region names rewritten: "UK" and "USA" are
# spelled out, and rows whose subregion is Hong Kong get "Hong Kong" as their
# region. All other regions are left unchanged.
# NOTE(review): presumably these match the country spellings in fisheries,
# which is joined on this column later -- confirm against the data.
world_map <- mutate(
  map_data("world"),
  region = case_when(
    region == "UK" ~ "United Kingdom",
    region == "USA" ~ "United States",
    subregion == "Hong Kong" ~ "Hong Kong",
    TRUE ~ region
  )
)
# Plain gray outline map of every region in world_map.
world_map %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "gray") +
  theme_minimal()
# Attach the map polygon vertices to each country's fisheries record, matching
# the country column of fisheries against the region column of world_map.
# Each country matches many vertices, so the result is far longer than
# fisheries.
fisheries_map <- fisheries %>%
  left_join(world_map, by = c("country" = "region"))
glimpse(fisheries_map)
## Rows: 72,685
## Columns: 11
## $ country <chr> "Angola", "Angola", "Angola", "Angola", "Angola", "A…
## $ capture <dbl> 486490, 486490, 486490, 486490, 486490, 486490, 4864…
## $ aquaculture <dbl> 655, 655, 655, 655, 655, 655, 655, 655, 655, 655, 65…
## $ total <dbl> 487145, 487145, 487145, 487145, 487145, 487145, 4871…
## $ continent <chr> "Africa", "Africa", "Africa", "Africa", "Africa", "A…
## $ aquaculture_perc <dbl> 0.001344569, 0.001344569, 0.001344569, 0.001344569, …
## $ long <dbl> 23.96650, 23.98828, 24.01006, 24.02559, 24.04141, 24…
## $ lat <dbl> -10.87178, -11.00283, -11.18477, -11.31563, -11.3741…
## $ group <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ order <int> 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 43…
## $ subregion <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
# Map of the countries present in fisheries_map, filled by capture harvest.
ggplot(fisheries_map, aes(x = long, y = lat, group = group, fill = capture)) +
  geom_polygon() +
  scale_fill_viridis_c() +
  theme_minimal()
Question: What is misleading about the map above?
# Two layers sharing the same position aesthetics: every region in light gray
# as a backdrop, then the countries in fisheries_map filled by capture harvest.
ggplot(mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(data = world_map, fill = "lightgray") +
  geom_polygon(data = fisheries_map, mapping = aes(fill = capture)) +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Fisheries harvest by capture, 2016",
    subtitle = "Capture measured in tonnes",
    caption = "Source: bit.ly/2VrawTt",
    x = "", y = ""
  )
# Same layered map as before, but the fill shows the natural log of capture so
# that differences among smaller harvests are easier to see.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    mapping = aes(x = long, y = lat, group = group, fill = log(capture))
  ) +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    x = "", y = "",
    title = "Fisheries harvest by capture, 2016",
    subtitle = "Capture measured in logged tonnes",
    caption = "Source: bit.ly/2VrawTt"
  )
# Layered map: every region in light gray as a backdrop, then the countries in
# fisheries_map filled by the log of their aquaculture harvest. One is added
# before taking the log so that a harvest of zero maps to 0 rather than -Inf.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    mapping = aes(
      x = long, y = lat, group = group,
      fill = log(aquaculture + 1)
    )
  ) +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    x = "", y = "",
    title = "Fisheries harvest by aquaculture, 2016",
    subtitle = "Aquaculture measured in logged tonnes",
    # The legend names the quantity that is actually plotted, including the +1.
    fill = "log(aquaculture + 1)",
    caption = "Source: bit.ly/2VrawTt"
  )
# Share of each country's total harvest that comes from aquaculture.
# NOTE(review): fisheries_map already carries aquaculture_perc from the join
# with fisheries (computed there with the same formula), so this step
# recomputes identical values.
fisheries_map <- fisheries_map %>%
mutate(
aquaculture_perc = aquaculture / total
)
# Layered map: every region in light gray as a backdrop, then the countries in
# fisheries_map filled by their aquaculture share of the total harvest, with
# the legend shown in whole percentages along the bottom.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    mapping = aes(
      x = long, y = lat, group = group,
      fill = aquaculture_perc
    )
  ) +
  scale_fill_viridis_c(labels = percent_format(accuracy = 1)) +
  theme_minimal() +
  theme(legend.position = "bottom", legend.key.width = unit(2, "lines")) +
  labs(
    x = "", y = "",
    # Each country shows its own share; nothing is averaged here, unlike in
    # the continent-level bar chart.
    title = "Share of aquaculture by country",
    subtitle = "out of total fisheries harvest, 2016",
    caption = "Source: bit.ly/2VrawTt",
    fill = "Aquaculture %"
  )